Load data

library("parathyroidSE")
library("EnsDb.Hsapiens.v86")
library("dplyr")
library("ggplot2")

data("parathyroidGenesSE", package = "parathyroidSE")

genes <- read.csv(textConnection(
  "name, group
   ESR1,  estrogen
   ESR2,  estrogen
   CASR,  parathyroid
   VDR,   parathyroid
   JUN,   parathyroid
   CALR,  parathyroid
   ORAI2, parathyroid"), 
  stringsAsFactors = FALSE, strip.white = TRUE)

ens <- ensembldb::select(EnsDb.Hsapiens.v86,
  keys = list(GenenameFilter(genes$name), 
              TxBiotypeFilter("protein_coding")),
  columns = c("GENEID", "GENENAME"))
ens <- 
  dplyr::filter(ens, GENEID %in% rownames(parathyroidGenesSE)) %>%
  mutate(group = genes$group[match(GENENAME, genes$name)])


countData <- assay( parathyroidGenesSE ) 
gene.counts <- t(countData[ens$GENEID, ])
colnames(gene.counts) <- ens$GENENAME
dat <- cbind(data.frame(colData( parathyroidGenesSE)), data.frame(gene.counts))
head(dat)
##         run experiment patient treatment time submission     study    sample
## 1 SRR479052  SRX140503       1   Control  24h  SRA051611 SRP012167 SRS308865
## 2 SRR479053  SRX140504       1   Control  48h  SRA051611 SRP012167 SRS308866
## 3 SRR479054  SRX140505       1       DPN  24h  SRA051611 SRP012167 SRS308867
## 4 SRR479055  SRX140506       1       DPN  48h  SRA051611 SRP012167 SRS308868
## 5 SRR479056  SRX140507       1       OHT  24h  SRA051611 SRP012167 SRS308869
## 6 SRR479057  SRX140508       1       OHT  48h  SRA051611 SRP012167 SRS308870
##   CALR  CASR ESR1 ESR2  JUN ORAI2  VDR
## 1 5055 14525    2    1 1799   632 1167
## 2 6161 16870    3    3 1153  1424 1653
## 3 3333  7798    1    1 1086   309  688
## 4 6407 14299    2    1 1227   974 1407
## 5 3810  9235    4    1 1258   326  702
## 6 4885 12994    2    3  906   814 1235

Initial plot

Plot one of the estrogen related gene's counts (ESR1) with plot aesthetics and faceting to separate patients, treatments and times.

ggplot(dat, aes(col = patient, x = treatment, y = ESR1)) +
  geom_point(size = 3) + 
  facet_grid( . ~ time)

Quiz Answers

From the plot of the parathyroid data, answer the following.

Quiz question 1 : How many patients are there?

Answer: There are four patients.

Quiz question 2 : How many time points are there?

Answer: There are two timepoints (24h, 48h).

Quiz question 3 : There were 3 treatments: "Control", "DPN", and "OHT". How many measurements were taken from patient 2 under the DPN treatment?

Answer: There were three measurements taken for patient 2 (green) under DPN treament.

Question: Make your own plot of VDR versus CASR. (That is CASR, not CALR).

Answer:

ggplot(dat, aes( x = VDR, y = CASR)) +
    geom_point(aes(fill = patient, shape = treatment, alpha=time), size = 3) +
    scale_alpha_manual(values = c(1.0, 0.7)) +
    scale_shape_manual(values = c(21, 22, 24)) +
    guides(fill = guide_legend(override.aes=list(shape=21))) +
    facet_grid( . ~ time)

Quiz question 4 : Which patient has the highest recorded level of CASR?

Answer: Patient 2.

Quiz question 5 : Which of the following pairs of patients seem to be well separated in this plot (i.e., for which two patients you can draw a line on the plot that perfectly separates them)?

Answer: Patient 1 and patient 2, also Patient 1 and patient 3.

Quiz question 6 : You need to know which pairs of patients are well separated with respect to the genes VDR and CASR (i.e., you can draw a line on the plot that perfectly separates the patients). Which of the following methods can help you visualize this?

Answer: Plot VDR versus CASR, and change the shape of the point according to the patient. Plot VDR versus CASR, and color the points according to the patient.

Quiz question 7 : Which patient looks different from the other three when you plot VDR versus ORAI2?

Answer: Patient 2.

ggplot(dat, aes( x = VDR, y = ORAI2)) +
    geom_point(aes(fill = patient, shape = treatment, alpha=time), size = 3) +
    scale_alpha_manual(values = c(1.0, 0.7)) +
    scale_shape_manual(values = c(21, 22, 24)) +
    guides(fill = guide_legend(override.aes=list(shape=21))) +
    facet_grid( . ~ time)

Quiz question 8 : Plot ORAI2 versus JUN. Which can you separate best?

Answer: 24 hours vs. 48 hours.

ggplot(dat, aes( x = JUN, y = ORAI2)) +
    geom_point(aes(fill = time, shape = patient, alpha=treatment), size = 3) +
    scale_alpha_manual(values = c(1.0, 0.7, 0.5)) +
    scale_shape_manual(values = c(21, 22, 23,24)) +
    guides(fill = guide_legend(override.aes=list(shape=21)))

Quiz question 9 : Plot CASR versus (ESR1 + ESR2). Fit a separate linear model for each treatment (Control, DPN, OHT). Which linear models are increasing?

Answer: DPN and OHT.

ggplot(dat, aes( y = CASR, x = ESR1 + ESR2)) +
    geom_point(aes(fill = treatment, color=treatment, shape = patient, alpha=time), size = 3) +
    geom_smooth(aes(color = treatment), method = "lm", se=FALSE)+
    scale_alpha_manual(values = c(1.0, 0.6)) +
    scale_shape_manual(values = c(21, 22, 23,24)) +
    guides(fill = guide_legend(override.aes=list(shape=21)))
## `geom_smooth()` using formula 'y ~ x'

Quiz question 10 : What is the maximum number of shapes that you are allowed to use in ggplot2 by default?

Answer: 6.

df_shape <- data.frame(x=runif(10),y=runif(10), shapes=as.factor(1:10))
ggplot(df_shape, aes(x=x,y=y, shape=shapes)) + geom_point()
## Warning: The shape palette can deal with a maximum of 6 discrete values because
## more than 6 becomes difficult to discriminate; you have 10. Consider
## specifying shapes manually if you must have them.
## Warning: Removed 4 rows containing missing values (geom_point).

Quiz question 11 : Write the name of the function that you could use to make more than the maximum number of default shapes allowed. Hint: this function has "values" as one of the arguments ____(..., values = (...)).

Answer: scale_shape_manual.

Quiz question 12 : What do Themes do in ggplot2?

Answer: They control non-data components of the plot.

In-class exercise: Customized scatter plot

You will try to recreate a plot from an Economist article showing the relationship between well-being and financial inclusion.

You can find the accompanying article at this link

The data for the exercises EconomistData.csv can be downloaded from the class github repository.

library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ tibble  3.0.3     ✓ purrr   0.3.4
## ✓ tidyr   1.1.0     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ── Conflicts ──────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::collapse()   masks IRanges::collapse()
## x dplyr::combine()    masks Biobase::combine(), BiocGenerics::combine()
## x dplyr::count()      masks matrixStats::count()
## x dplyr::desc()       masks IRanges::desc()
## x tidyr::expand()     masks S4Vectors::expand()
## x dplyr::filter()     masks ensembldb::filter(), stats::filter()
## x dplyr::first()      masks S4Vectors::first()
## x dplyr::lag()        masks stats::lag()
## x ggplot2::Position() masks BiocGenerics::Position(), base::Position()
## x purrr::reduce()     masks GenomicRanges::reduce(), IRanges::reduce()
## x dplyr::rename()     masks S4Vectors::rename()
## x dplyr::select()     masks ensembldb::select(), AnnotationDbi::select()
## x purrr::simplify()   masks DelayedArray::simplify()
## x dplyr::slice()      masks IRanges::slice()
url <- paste0("https://raw.githubusercontent.com/cme195/cme195.github.io/",
              "master/assets/data/EconomistData.csv")
dat <- read_csv(url)
## Parsed with column specification:
## cols(
##   Country = col_character(),
##   SEDA.Current.level = col_double(),
##   SEDA.Recent.progress = col_double(),
##   Wealth.to.well.being.coefficient = col_double(),
##   Growth.to.well.being.coefficient = col_double(),
##   Percent.of.15plus.with.bank.account = col_double(),
##   EPI_regions = col_character(),
##   Region = col_character()
## )
head(dat)
## # A tibble: 6 x 8
##   Country SEDA.Current.le… SEDA.Recent.pro… Wealth.to.well.… Growth.to.well.…
##   <chr>              <dbl>            <dbl>            <dbl>            <dbl>
## 1 Albania             50               63.3             1.27             1.31
## 2 Algeria             40.6             46.5             0.87             1.03
## 3 Angola              17.8             76.2             0.54             1.21
## 4 Argent…             54.1             49.1             0.91             0.89
## 5 Armenia             43.8             46               1.25             1.11
## 6 Austra…             87.9             40.9             1.07             0.92
## # … with 3 more variables: Percent.of.15plus.with.bank.account <dbl>,
## #   EPI_regions <chr>, Region <chr>

Part 1

  1. Create a scatter plot similar to the one in the article, where the x axis corresponds to percent of people over the age of 15 with a bank account (the Percent.of.15plus.with.bank.account column) and the y axis corresponds to the current SEDA score SEDA.Current.level.
  2. Color all points blue.
  3. Color points according to the Region variable.
  4. Overlay a fitted smoothing trend on top of the scatter plot. Try to change the span argument in geom_smooth to a low value and see what happens.
  5. Overlay a regression line on top of the scatter plot Hint: use geom_smooth with an appropriate method argument.
  6. Facet the previous plot by Region.
#1. Create a scatter plot with percent of people over the age of 15 with a bank 
p <- ggplot(
  dat, aes(x = Percent.of.15plus.with.bank.account, y = SEDA.Current.level)) 
p + geom_point()

#2. Color the points in the previous plot blue.
p + geom_point(color = "blue")

#3. Color the points in the previous plot according to the `Region`.
(p3 <- p + geom_point(aes(color = Region)))

# 4. Overlay a smoothing line on top of the scatter plot using the default method.
p3 + geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

#4. Changing the span parameter
p3 + geom_smooth(span = 0.2)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

#5. Overlay a smoothing line on top of the scatter plot using the lm method
(p5 <- p3 + geom_smooth(method = "lm"))
## `geom_smooth()` using formula 'y ~ x'

# 6. Facetting plots
p5 + facet_wrap(~ Region)
## `geom_smooth()` using formula 'y ~ x'
## Warning in qt((1 - level)/2, df): NaNs produced
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf

Part 2: Distribution of categorical variables

  1. Generate a bar plot showing the number of countries included in the dataset from each Region.
ggplot(dat, aes(x = Region)) +
  geom_bar() +
  theme(axis.text.x = element_text(angle = 15, hjust = 1))

dat <- dat %>%
  mutate(reg = reorder(Region, Region, function(x) -length(x)))
barplot <- ggplot(dat, aes(x = reg)) +
  geom_bar() +
  theme(axis.text.x = element_text(angle = 15, hjust = 1))
barplot

  1. Rotate the plot so the bars are horizontal
barplot + coord_flip()

Part 3: Distribution of continuous variables

  1. Create boxplots of SEDA scores, SEDA.Current.level separately for each Region.
  2. Overlay points on top of the box plots
  3. The points you added are on top of each other, in order to distinguish them jitter each point by a little bit in the horizontal direction.
  4. Now substitute your boxplot with a violin plot.
plt <- ggplot(dat, aes(x = Region, y = SEDA.Current.level)) + 
  theme(axis.text.x = element_text(angle = 15, hjust = 1))
plt + geom_boxplot()  

plt + geom_boxplot() + geom_point()

plt + geom_boxplot() + geom_jitter(width = 0.1)

plt + geom_violin() + geom_jitter(width = 0.1)

Emulating the Economist 'style'

Below, I will show you how to obtain an 'Economist-look' for your scatter plot in few lines of code. To generate a replicate plot we need to:

  1. Change ordering of the regions, by converting Region column to a factor.
  2. Use seetings for the markers to best match the points on the original Economist plot. Note that the points are bigger and have white borders, and specific fill colors. The following colors match the ones on the plot: colors <- c("#28AADC","#F2583F", "#76C0C1","#24576D", "#248E84","#DCC3AA", "#96503F")
  3. Change the axes ratio.
  4. Change the plot background and theme. Note that ggthemes package has a convenient functions for generating "Economist" style plots, e.g. theme_economist_white().
  5. Format the legend.
  6. Add "Country" labels to the points.
  7. Add a title and format the axes.

First, change order of and lables for Regions

regions <- c("Europe", "Asia", "Oceania", "North America", 
             "Latin America & the Caribbean", "Middle East & North Africa",
             "Sub-Saharan Africa")

# Here we are just modifying labels so that some names are on two lines
region_labels <-  c("Europe", "Asia", "Oceania", "North America",
                    "Latin America & \n the Caribbean", 
                    "Middle East & \n North Africa", "Sub-Saharan \n Africa")
dat <- dat %>%
  mutate(
    Region = as.character(Region),
    Region = factor(Region, levels = regions, labels = region_labels)
  )
custom_colors <- c("#28AADC","#F2583F", "#76C0C1","#24576D", "#248E84",
                   "#DCC3AA","#96503F")
p <- ggplot(
  dat, aes(Percent.of.15plus.with.bank.account, SEDA.Current.level)) +
  geom_point(aes(fill = Region), color = "white", size = 4, pch = 21) +
  geom_smooth(method = "lm", se = FALSE, col = "black", size = 0.5) +
  scale_fill_manual(name = "", values = custom_colors) +
  coord_fixed(ratio = 0.4) +
  scale_x_continuous(name = "% of people aged 15+ with bank account, 2014",
                     limits = c(0, 100),
                     breaks = seq(0, 100, by = 20)) +
  scale_y_continuous(name = "SEDA Score, 100=maximum",
                     limits = c(0, 100),
                     breaks = seq(0, 100, by = 20)) +
  labs(title="Laughing all the way to the bank",
       subtitle="Well-being and financial inclusion* \n 2014-15")
p
## `geom_smooth()` using formula 'y ~ x'

To change the background and theme to match the 'Economist style', you can install the ggthemes package that implements the themes from:

  • Base graphics
  • Tableau
  • Excel
  • Stata
  • Economist
  • Wall Street Journal
  • Edward Tufte
  • Nate Silver's Fivethirtyeight
  • etc.
#install.packages("ggthemes")
library(ggthemes)
(p <- p + theme_economist_white(gray_bg = FALSE))
## `geom_smooth()` using formula 'y ~ x'

Format the legend

p + theme(
  text = element_text(color = "grey35", size = 11),
  legend.text = element_text(size = 10),
  legend.position = c(0.72, 1.12),   
  legend.direction = "horizontal") +
  guides(fill = guide_legend(ncol = 4, byrow = FALSE))
## `geom_smooth()` using formula 'y ~ x'

Add point labels

# Choose a subset of countries
pointsToLabel <- c(
  "Yemen", "Iraq", "Egypt", "Jordan", "Chad", "Congo", "Angola", "Albania",
  "Zimbabwe", "Uganda", "Nigeria", "Uruguay", "Kazakhstan", "India", "Turkey",
  "South Africa", "Kenya", "Russia", "Brazil", "Chile", "Saudi Arabia", 
  "Poland", "China", "Serbia", "United States", "United Kingdom")
# install.packages("ggrepel")
library(ggrepel)
(p <- p + 
    geom_text_repel(
      aes(label = Country), color = "grey20",
      data = dat %>% filter(Country %in% pointsToLabel),
      force = 15))
## `geom_smooth()` using formula 'y ~ x'